In [1]:
%autosave 0
Autosave disabled

Instructions

  • Unpack data into appropriate (sub-)folders
    • DATA-Returns-Market
    • DATA-Returns-Sectors
    • DATA-Close-Market
    • DATA-Close-Sectors
    • DATA-Fundamentals-Market
  • Run cells in Imports section
  • Run cells in Data Folder Definitions
  • Run cells in Helper Functions section
  • Run cells in section of interest, either
    • Entire Market with PCA, DBSCAN and T-SNE, or
    • Entire Market with Covariance Shrinkage, Graph Lasso and LLE, or
    • Market Sectors with Covariance Shrinkage, Graph Lasso and LLE

Acknowledgements

  • Jonathan Larkin
  • Delanie MacKensie
  • SciKit Learn website example on Visualizing the Stock Market

Imports

In [87]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.cm as cm  

import re
import os
import sys
import time
import quandl
# import hdbscan
import logging
import datetime
import platform
import collections
import numpy as np
import pandas as pd
import pickle as pkl

from glob import glob
from scipy import stats
from scipy import linalg
from scipy.linalg import (
    toeplitz, cholesky
)
from textwrap import wrap
from datetime import datetime
from pandas import DataFrame

from six.moves.urllib.request import urlopen
from six.moves.urllib.parse import urlencode

from statsmodels.tsa.stattools import coint

from matplotlib.collections import LineCollection

from sklearn import (
    cluster, 
    covariance, 
    manifold,
    preprocessing
)

from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN

from sklearn.covariance import (
    OAS,
    GraphLasso,
    LedoitWolf,
    log_likelihood, 
    ShrunkCovariance,
    empirical_covariance
)

from sklearn.model_selection import GridSearchCV

quandl.ApiConfig.api_key = 'vP-UcGRwvsHvnr9cxtTc'

Data Folder Definitions

In [90]:
operating_system = platform.system()
operating_system
Out[90]:
'Windows'
In [94]:
if operating_system == 'Windows':    
    folder_delimiter = r'\\'
elif operating_system == 'Linux':    
    folder_delimiter = r'/'
else:
    print("Unknown operating system.")
    
sectorsfolder = r'DATA-Sectors' + folder_delimiter

dictfolder_market = r'DATA-Dictionaries' + folder_delimiter  
dictfolder_sectors = r'DATA-Dictionaries-Sectors' + folder_delimiter  

returnsfolder_market = r'DATA-Returns-Market' + folder_delimiter
returnsfolder_sectors = r'DATA-Returns-Sectors' + folder_delimiter 

fundamentalsfolder = r'DATA-Fundamentals-Market' + folder_delimiter

closefolder_market = r'DATA-Close-Market' + folder_delimiter  
closefolder_sectors = r'DATA-Close-Sectors' + folder_delimiter  
openclosefolder_sectors = r'DATA-OpenClose-Sectors' + folder_delimiter 
closefolder_sectors_cleaned = r'DATA-Close-Sectors-Cleaned' + folder_delimiter 

marketcap_filepath = fundamentalsfolder + "marketcap.xlsx"
pricetobook_filepath = fundamentalsfolder + "marketcap.xlsx"
pricetoearnings_filepath = fundamentalsfolder + "marketcap.xlsx"
enterprisevalue_filepath = fundamentalsfolder + "marketcap.xlsx"
In [4]:
print ("Numpy: %s " % np.__version__)
print ("Pandas: %s " % pd.__version__)
Numpy: 1.14.2 
Pandas: 0.22.0 

Helper functions

Clustering

In [3]:
# Input:
# X should be standard returns data 
# with features as column headers 
# and rows as observation data.

def build_affinity_clusters(stock_universe, X, pricing=None):

    np.warnings.filterwarnings('ignore')
    
    # Neat trick: elementwise (i.e. row) arithmetic
    # to standardize the *column* values. Works
    # because std() returns a series that functions
    # much like a row.
    
    X /= X.std(axis=0)
    
    emp_cov, shrunk_cov, lw, oa = get_covariance(X)
    try:
        # alpha = 0.01
        # cov, icov, costs = covariance.graph_lasso(
        #                     emp_cov,  # lw.covariance_
        #                     alpha,
        #                     return_costs=True)        
        
        edge_model = covariance.GraphLassoCV() 
        edge_model.fit(X) 
        cov = edge_model.covariance_
        icov = edge_model.precision_
    
    except FloatingPointError:
        print("Failed at alpha=%s"%alpha)
        return

    # dimensions of precision matrix = NxN, 
    # where N = number of stocks
    _, labels = cluster.affinity_propagation(cov)
    n_labels = labels.max()    
    # print("Cluster labels=%s" % labels) 
    
    # map of ticker symbols to cluster numbers
    cluster_map = pd.Series(
                index=X.columns, 
                data=labels)

    # get the number of stocks in each cluster
    # NOTE: value_counts of a series yields another series
    counts = cluster_map.value_counts()
        
    nclusters = len(counts)
    print("\n")
    print("Number of clusters detected: %d" % nclusters) 
    
    max = counts.max()
    print("Cluster max size: %d" % max) 

    # Unlike DBSCAN, Affinity clustering doesn't not
    # attempt to filter out noise (in particular, it
    # does not place a lower limit on the size of a cluster)
    bigger_than_one = counts[counts > 1]    
    
    # select 3 largest clusters 
    highest3clusters = bigger_than_one.nlargest(3).index
    highest3counts = bigger_than_one[highest3clusters]         
    highest3counts = list(highest3counts)
    # print("\n")
    print("\n3 cluster labels with the highest counts=%s" % list(highest3clusters)) 
    print("3 highest counts=%s" % highest3counts)
    
    plt.barh(
        highest3clusters,
        highest3counts
    )    

    plt.title('Top 3 Cluster Member Counts')
    plt.xlabel('Stocks in Cluster')
    plt.ylabel('Cluster Number'); 

    # If pricing data is provided, plot the time series
    # of the stocks in the 3 smallest clusters.
    
    if pricing is not None:
        
        # select 3 smallest clusters 
        smallest3clusters = bigger_than_one.nsmallest(3).index
        smallest3counts = bigger_than_one[smallest3clusters]         
        smallest3counts = list(smallest3counts)

        print("\nPlot the price series of stocks in smallest clusters.\n")         
        print("3 cluster labels with the smallest counts=%s" % list(smallest3clusters))
        print("3 smallest counts=%s" % smallest3counts)
        
        for n, clust in enumerate(smallest3clusters):

            # find indices of tickers in this cluster
            idx = cluster_map[cluster_map==clust].index
            tickers = list(idx)
            
            print("Tickers in cluster %d: %s" % (clust, tickers))            

            # If the cluster size is > 1, graph the time series
            # of its stocks
            
            if smallest3counts[n] > 1:
                
                # subtract the mean of each price series
                means = np.log(pricing[tickers].mean())
                data = np.log(pricing[tickers]).sub(means)

                title='Stock Time Series for Cluster %d' % clust
                data.plot(title=title)     
    
    output_tuple = (cov, icov, labels, counts, cluster_map) 
    opfilename = "Output\\" + stock_universe + ".pkl"
    with open(opfilename, "wb") as fd:
        pkl.dump(output_tuple, fd)
    
    # return (cov, icov, labels, counts, cluster_map)  
    return output_tuple  

Huge advantage of DBSCAN clustering is that it identifies noise better than many other clustering algorithms and chooses not to include some data points in any cluster. Tickers that are rejected by DBSCAN will be assigned the pseudo-cluster number of -1.

Discard all tickers with a cluster value of -1.

clf.labels_ contains a simple 1D array (or vector) of cluster numbers. The index number of each element (or if you want to view it as a column vector, the row number of each element) corresponds to the index number of the ticker in X.

In [67]:
def build_dbscan_clusters(X, tickers, pricing=None):

    clf = DBSCAN(eps=1.5, leaf_size=30, min_samples=3)
    print (clf)
    
    clf.fit(X)
    
    dbscan_labels = clf.labels_         
    # print("Cluster labels=%s" % dbscan_labels)      

    # map of ticker symbols to cluster numbers
    dbscan_cluster_map_all = pd.Series(
                index=tickers, 
                data=dbscan_labels)  
    
    # tickers associated with cluster -1 are noise
    # remove them
    dbscan_cluster_map = dbscan_cluster_map_all[
                dbscan_cluster_map_all != -1]
    
    # get the number of stocks in each cluster
    # NOTE: value_counts of a series yields another series
    dbscan_counts = dbscan_cluster_map.value_counts()

    dbscan_nclusters = len(dbscan_counts)
    # print("\n")
    print("Number of DBSCAN clusters detected: %d" % dbscan_nclusters) 

    # number of pairs we can create from clusters
    npairs = (dbscan_counts * (dbscan_counts - 1)).sum()
    print("Number of pairs we can create from DBSCAN clusters: %d" % npairs)    

    # We can't access dictionary keys using values in a deterministic
    # manner, because potentially different keys could be associated
    # with the same value. But we're must looking for a sample of the
    # largest clusters, so that doesn't matter here.    
    max = dbscan_counts.max()    
    num_largestcluster = dict((v,k) for k,v in dbscan_counts.items()).get(max)    
    
    print ("Maximum cluster size (in cluster %d): %d" % (
            num_largestcluster, max))  
    
    # Clusters with 1 member are not interesting
    bigger_than_one = dbscan_counts[counts > 1] 

    # select 3 largest clusters 
    highest3clusters = bigger_than_one.nlargest(3).index
    highest3counts = bigger_than_one[highest3clusters]         
    highest3counts = list(highest3counts)
    
    print("\n3 cluster labels with the highest counts=%s" % list(highest3clusters)) 
    print("3 highest counts=%s" % highest3counts)  
    
    plt.barh(
        highest3clusters,
        highest3counts
    )
    
    plt.title('Top 3 Cluster Member Counts')
    plt.xlabel('Stocks in Cluster')
    plt.ylabel('Cluster Number');

    # If pricing data is provided, plot the time series
    # of the stocks in the 3 smallest clusters.
    
    if pricing is not None:
        
        # select 3 smallest clusters 
        smallest3clusters = bigger_than_one.nsmallest(3).index
        smallest3counts = bigger_than_one[smallest3clusters]         
        smallest3counts = list(smallest3counts)
        
        print("\nPlot the price series of stocks in smallest clusters.\n")         
        print("3 cluster labels with the smallest counts=%s" % list(smallest3clusters))
        print("3 smallest counts=%s" % smallest3counts)

        for n, clust in enumerate(smallest3clusters):

            # find indices of tickers in this cluster
            idx = dbscan_cluster_map[dbscan_cluster_map==clust].index
            tickers = list(idx)
         
            print("Tickers in cluster %d: %s" % (clust, tickers))            
            
            # If the cluster size is > 1, graph the time series
            # of its stocks
            
            if smallest3counts[n] > 1:
                
                # subtract the mean of each price series
                means = np.log(pricing[tickers].mean())
                data = np.log(pricing[tickers]).sub(means)

                title='Stock Time Series for Cluster %d' % clust
                data.plot(title=title)       
    
    return (dbscan_cluster_map_all, 
            dbscan_cluster_map, 
            dbscan_counts, 
            dbscan_labels
    )      
In [6]:
def list_clusters(tickers, clusterlabels):

    # map of ticker symbols to cluster numbers
    cluster_map = pd.Series(
                index=tickers.columns, 
                data=clusterlabels)

    dic = {}

    for x in clustermap.index:

        cluster = clustermap[x]
        if cluster in dic.keys():
            dic[cluster].append(x) 
        else:        
            dic[cluster] = [str(x)] 
        
    return dic

Cointegration

find_cointegrated_pairs: find cointegrated pairs in a cluster.

In [7]:
# Acknowledgement:

# This function was created by Delanie MacKensie and
# posted on the Quantopian website here:  
# https://www.quantopian.com/lectures/introduction-to-pairs-trading

def find_cointegrated_pairs(data, significance=0.05):   
    
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    
    for i in range(n):
        for j in range(i+1, n):
            
            S1 = data[keys[i]]
            S2 = data[keys[j]] 
            
            result = coint(S1, S2)
            
            score = result[0]
            pvalue = result[1] 
            
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue 
            
            if pvalue < significance:
                pairs.append((keys[i], keys[j]))
                
    return score_matrix, pvalue_matrix, pairs
In [8]:
# generate dictionary of cointegrated pairs

def gen_dictionary_coint_pairs(pricing, cluster_map, counts): 
    
    cluster_dict = {}
    
    aggregated_fraction = 0
    n = 0

    for i, clust in enumerate(counts.index):

        # fetch the tickers for this cluster
        tickers = cluster_map[cluster_map == clust].index

        num_tickers = len(tickers)
        if num_tickers < 2:
            continue
        
        score_matrix, pvalue_matrix, pairs = find_cointegrated_pairs(
            pricing[tickers]
        )

        # build a dictionary whose keys are cluster numbers 
        # each whose elements contains yet another dictionary
        # with keys 'pairs', 'score_matrix' and 'pvalue_matrix'
        # Values of the keys in the nested dictionary will be
        # as follows
        # -- 'pairs' key:          A list of pairs
        # -- 'score_matrix' key:   An array of pvalues
        # -- 'pvalue_matrix' key:  An array of scores    
        
        num_pairs = len(pairs)
        num_potential_pairs = num_tickers * (num_tickers - 1) / 2
        
        # print("num_pairs=%d" % num_pairs)
        # print("num_tickers=%d" % num_tickers)
        # print("num_potential_pairs=%d" % num_potential_pairs)
        
        # fraction of pairs in cluster that cointegrate
        f = num_pairs / num_potential_pairs 
        
        cluster_dict[clust] = {}
        cluster_dict[clust]['score_matrix'] = score_matrix
        cluster_dict[clust]['pvalue_matrix'] = pvalue_matrix
        cluster_dict[clust]['pairs'] = pairs   
        cluster_dict[clust]['cluster_size'] = num_tickers
        cluster_dict[clust]['number_potential_pairs'] =  num_potential_pairs
        cluster_dict[clust]['number_of_pairs'] = num_pairs
        cluster_dict[clust]['fraction_successful'] = f
    
        aggregated_fraction = aggregated_fraction + f
        n = n + 1
    
    aggregated_fraction /= n

    return cluster_dict, aggregated_fraction
In [9]:
def aggregate_pairs_allclusters(stock_universe, pricing, cluster_map, counts):

    cluster_dict, fraction_successful = gen_dictionary_coint_pairs(
                        pricing, 
                        cluster_map, 
                        counts)
    
    # aggregate cointegrated pairs from all clusters
    aggregated_pairs = []
    for clust in cluster_dict.keys():
        aggregated_pairs.extend(cluster_dict[clust]['pairs'])  

    print("We found %d pairs." % len(aggregated_pairs))
    print("Containing %d unique tickers." % len(
          np.unique(aggregated_pairs)))   
    
    print("Fraction (on the average) of pairs that cointegrate = %f" % (
           fraction_successful))    

    opfilename = "Output\\" + stock_universe + ".clust_dict.pkl"
    with open(opfilename, "wb") as fd:
        pkl.dump(cluster_dict, fd)    
    
    opfilename = "Output\\" + stock_universe + ".pairs.pkl"
    with open(opfilename, "wb") as fd:
        pkl.dump(aggregated_pairs, fd)
        
    return (aggregated_pairs, cluster_dict)

partially_correlated_pairs

Return DataFrame with partially correlated pairs, their ticker symbols and their partial correlation value.

In [10]:
def partially_correlated_pairs(stock_universe, returns, icov, minvalue):
    
    num_stocks = len(returns.columns)

    idx_to_stock = pd.Series(
           index=np.arange(num_stocks), 
           data=returns.columns)

    filtered = (np.abs(np.triu(icov, k=1)) > minvalue)   
    values = np.abs(icov[filtered]) 
    x, y = np.where(filtered)   

    indices = list(zip(x, y))

    df = pd.DataFrame()
    df['indices'] = indices
    df['values'] = values 
    s = pd.Series(data=np.zeros(len(df)))

    f = lambda tup: tuple(idx_to_stock[x] for x in tup)
    for i in df.index:    
        t = df.loc[i]['indices']
        pair = f(t)
        s.loc[i] = pair 

    df['pairs'] = s
    df    
    
    df = df.sort_values('values', ascending=False) 

    corrval = "%.3f" % minvalue
    opfn = "Output\\" + stock_universe 
    opfn = opfn + ".pairs.incov." + corrval 
    opfilename = opfn + ".pkl"
    with open(opfilename, "wb") as fd:
        pkl.dump(df, fd)    
    
    return df

find_coint_partially_corr_pairs:

Check partially correlated pairs identified by the precision matrix to see if any of them are cointegrated.

In [11]:
def find_coint_partially_corr_pairs(df):
    
    significance = 0.05
    pairs = []

    for x in df['pairs']:    
        pair = list(x)
        ticker1 = pair[0]
        ticker2 = pair[1]
        S1 = pricing[ticker1]
        S2 = pricing[ticker2] 

        result = coint(S1, S2)

        score = result[0]
        pvalue = result[1] 

        if pvalue < significance:
            pairs.append((ticker1, ticker2))   
    
    num_potential_pairs = len(df)
    if num_potential_pairs < 1:
        
        print("Finding cointegrated pairs of partially correlated stocks:")
        print("Number of potentially cointegrated pairs = %d" % (
        num_potential_pairs))        
        
        return
    
    num_pairs = len(pairs)
    
    print("Finding cointegrated pairs of partially correlated stocks:")
    print("Number of potentially cointegrated pairs = %d" % (
        num_potential_pairs))
    print("Number of conintegrated pairs = %d" % num_pairs)
    str_fraction = "%.3f" % (num_pairs / num_potential_pairs)
    print("Fraction of partially corr pairs that cointegrate = %s" % (
        str_fraction))
    
    return pairs

Covariance Shrinkage

Test soundness of covariance matrix

In [12]:
def negative_eigenvalues(cov):
    w, v = linalg.eig(cov)
    return np.any(np.less(w, 0))
In [13]:
def maxmin_eigenvalues(cov):
    w, v = linalg.eig(cov)
    return np.max(w), np.min(w)
In [14]:
def get_covariance(X):
    
    # Empirical Covariance
    
    # X : ndarray of shape (n_samples, n_features)
    emp_cov = covariance.empirical_covariance(X)
    
    shrunk_cov = covariance.shrunk_covariance(
                emp_cov, shrinkage=0.9) 

    X_T = X.T.copy()

    try:
    
        train_len = int(0.9 * len(X_T)) + 1
        test_len = int(0.1 * len(X_T)) 
        X_train = X_T[:train_len]
        X_test = X_T[train_len:] 
        
        assert (len(X_train) + len(X_test) == len(X_T))

        # Ledoit-Wolf optimal shrinkage coefficient estimate
        lw = LedoitWolf()
        lw.fit(X_train)

        # OAS coefficient estimate
        oa = OAS()
        oa.fit(X_train)
        
        return emp_cov, shrunk_cov, lw, oa
        
    except AssertionError:
        print("My Error: Size of train and test sets don't add up to propert value.")    

Embedding

In [15]:
# Input:
# X should be standard returns data 
# with features as column headers 
# and rows as observation data.

def get_LLE_embeddings(X):    

    # Neat trick: elementwise (i.e. row) arithmetic
    # to standardize the *column* values. Works
    # because std() returns a series that functions
    # much like a row.
        
    X /= X.std(axis=0)
    
    model = manifold.LocallyLinearEmbedding(
        n_components = 2, 
        eigen_solver = 'dense', 
        n_neighbors  = 6)

    # If tickers label columns, we must transpose X 
    # If tickers label rows, no need to transpose
    # embedding = node_position_model.fit_transform(X.T).T
    # embedding = node_position_model.fit_transform(returns.T)
    embedding = model.fit_transform(X.T)
    print("embedding.shape=%s" % str(embedding.shape))

    # We don't transpose embedding because plt.scatter() 
    # requires a transpose of its dimensions but because 
    # it requires two lists that happen to correspond to 
    # the columns of embedding. By transposing embedding 
    # we can access the two lists by simplying selecting 
    # the rows with indices 0 and 1.

    embed0 = embedding.T[0]
    embed1 = embedding.T[1] 
    print("embed0.shape=%s" % str(embed0.shape))
    print("embed1.shape=%s" % str(embed1.shape))

    return embed0, embed1, embedding
In [16]:
def get_TSNE_embeddings(X):    

    # If tickers label columns, we must transpose X 
    # If tickers label rows, no need to transpose
    # embedding = node_position_model.fit_transform(X.T).T
    # embedding = node_position_model.fit_transform(returns.T)
    embedding = TSNE(learning_rate=1000, perplexity=25, 
                     random_state=1337).fit_transform(X)    

    print("embedding.shape=%s" % str(embedding.shape))

    # We don't transpose embedding because plt.scatter() 
    # requires a transpose of its dimensions but because 
    # it requires two lists that happen to correspond to 
    # the columns of embedding. By transposing embedding 
    # we can access the two lists by simplying selecting 
    # the rows with indices 0 and 1.

    embed0 = embedding.T[0]
    embed1 = embedding.T[1] 
    print("embed0.shape=%s" % str(embed0.shape))
    print("embed1.shape=%s" % str(embed1.shape))

    return embed0, embed1, embedding

Plotting

In [18]:
def do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=1000000):
    
    # d = 1 / np.sqrt(np.diag(icov))   
    
    ax = fig.add_subplot(1,1,1)   
    plt.axis('off')
    
    # sizes = dotsize * np.linspace(0, 1, len(embed0))
    sizes = dotsize * counts.values
    
    ax.scatter(embed0, embed1, s=sizes, c=labels, cmap=plt.cm.nipy_spectral)   
                               
    # ax.scatter(embed0, embed1, s=dotsize * d ** 2,                
    #            c=labels, cmap=plt.cm.nipy_spectral)      
In [19]:
def do_noisy_scatter_plot(x, y, x_noise, y_noise, labels, fig):
    
    # plt.clf()
    # plt.axis('off')

    ax = fig.add_subplot(1,1,1) 
    
    ax.scatter(
        x,
        y,
        s=100,
        alpha=0.85,
        c=labels[labels!=-1],
        cmap=cm.Paired
    )

    ax.scatter(
        x_noise,
        y_noise,
        s=100,
        alpha=0.05
    )

    title = "T-SNE of all Stocks with DBSCAN Clusters Noted"
    plt.title(title); 
In [20]:
def do_LLE_graph(X, icov, labels, embedding, fig):
    
    # Display a graph of the partial correlations
    # partial_correlations = edge_model.precision_.copy()
    # d = 1 / np.sqrt(np.diag(partial_correlations))
    
    n_labels = len(labels)
    
    ax = fig.add_subplot(1,1,1) 
    
    d = 1 / np.sqrt(np.diag(icov))
    icov *= d
    icov *= d[:, np.newaxis]
    non_zero = (np.abs(np.triu(icov, k=1)) > 0.02)   
    
    if not non_zero.any():
        print("No off-diagonal elements found in the precision matrix.")
        return

    # plt.figure(1, facecolor='w', figsize=(10, 8))
    # plt.clf()
    # ax = plt.axes([0., 0., 1., 1.])
    plt.axis('off')

    embedding = embedding.T
    
    embed0 = embedding[0]
    embed1 = embedding[1]
    
    # plt.scatter(embed0, embed1, s=100 * d ** 2, 
    ax.scatter(embed0, embed1, s=100 * d ** 2, 
                c=labels, cmap=plt.cm.nipy_spectral)    
    
    # tickers = returns.T.index
    tickers = X.T.index
    names = list(tickers)

    # Plot the edges
    start_idx, end_idx = np.where(non_zero)
    segments = [[embedding[:, start], embedding[:, stop]]
                for start, stop in zip(start_idx, end_idx)]
    values = np.abs(icov[non_zero])
    lc = LineCollection(segments,
                        zorder=0, cmap=plt.cm.hot_r,
                        norm=plt.Normalize(0, .7 * values.max()))
    lc.set_array(values)
    lc.set_linewidths(15 * values)
    ax.add_collection(lc)

    # Add a label to each node. The challenge here is that we want to
    # position the labels to avoid overlap with other labels
    for index, (name, label, (x, y)) in enumerate(
            zip(names, labels, embedding.T)):

        dx = x - embedding[0]
        dx[index] = 1
        dy = y - embedding[1]
        dy[index] = 1
        this_dx = dx[np.argmin(np.abs(dy))]
        this_dy = dy[np.argmin(np.abs(dx))]
        if this_dx > 0:
            horizontalalignment = 'left'
            x = x + .002
        else:
            horizontalalignment = 'right'
            x = x - .002
        if this_dy > 0:
            verticalalignment = 'bottom'
            y = y + .002
        else:
            verticalalignment = 'top'
            y = y - .002
        plt.text(x, y, name, size=10,
                 horizontalalignment=horizontalalignment,
                 verticalalignment=verticalalignment,
                 bbox=dict(facecolor='w',
                           edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                           alpha=.6))

    plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
             embedding[0].max() + .10 * embedding[0].ptp(),)
    plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
             embedding[1].max() + .03 * embedding[1].ptp())

    # plt.show()    
In [21]:
# For the PCA/DBSCAN case, we have a 50+ dimensional 
# representation of the stocks, and we will embed those 
# 50+ dimensional vectors in 2D space using TSNE, then
# we will plot the points and add lines connecting
# points within each pair.
    
def visualize_cointegrated_pairs(X, returns, embedding, pairs, cluster_map, fig):

    if len(pairs) == 0:
        print("No pairs to visualize.")
        return
    
    ax = fig.add_subplot(1,1,1)     
    plt.axis('off')
    
    # map of tickers found in pairs to their clusters
    stocks = list(np.unique(pairs))
    in_pairs_series = cluster_map.loc[stocks]
    print("Size of map: %s" % str(in_pairs_series.shape))    
    
    # build DataFrame of returns for stocks included 
    # in cointegrated pairs
    
    # X_df has tickers for its index, so X_df.loc[stocks] 
    # will yield vectors of dimension 54 (principal components 
    # of dimension 50 + 4 dimensions for fundamentals) for 
    # all stocks found in some pair or other.
    X_df = pd.DataFrame(index=returns.T.index, data=X)

    # X_pairs is not a list of pairs, but a list of the 
    # stock tickers found within some pair or other. It is 
    # a subset of of X_df and has the same DataFrame format. 
    X_pairs = X_df.loc[stocks]
    print("X_pairs.shape=%s" % str(X_pairs.shape))    
    
    # Get the embedded representations of all the stocks
    # present in one or more of the cointegrated pairs

    emb = pd.DataFrame(columns=['x', 'y'])
    
    for pair in pairs:

        # Row numbers of array X_tsne correspond to index
        # numbers of tickers in X_pairs. Each row of X_tsne
        # contains a 2-tuple with coordinates for the ticker
        # in embedded T-SNE space.

        # ticker1 = pair[0].symbol << Quantopian ticker object
        ticker1 = pair[0]
        loc1 = X_pairs.index.get_loc(pair[0])
        x1, y1 = embedding[loc1, :]
        emb.loc[loc1] = [x1, y1]

        # ticker2 = pair[0].symbol << Quantopian ticker object
        ticker2 = pair[1]
        loc2 = X_pairs.index.get_loc(pair[1])
        x2, y2 = embedding[loc2, :]
        emb.loc[loc2] = [x2, y2]

        # plot the lines between the dots that will be
        # drawn later after this loop completes
        plt.plot([x1, x2], [y1, y2], 'k-', alpha=0.3, c='gray');

    cmvals = in_pairs_series.values / max(in_pairs_series.values)    

    plt.scatter(emb['x'], emb['y'], s=220, 
                # alpha=0.9, c=[in_pairs_series.values], 
                alpha=0.9, c=cmvals, 
                cmap=cm.Paired)
    plt.title('T-SNE Visualization of Validated Pairs');
In [22]:
def do_data_scaling(X_Clean, year_range, num_obs):    
    
    myScaler = preprocessing.StandardScaler()
    XScaled = myScaler.fit_transform(X_Clean)
    X = XScaled.copy()
    
    return X 

Reading Data

In [23]:
def get_market_returns_data(returns_file):
    
    returns_filepath = returnsfolder_market + returns_file

    with open(returns_filepath, 'rb') as fd:
        returns = pkl.load(fd)  

    print("returns.shape=%s" % str(returns.shape))    
    
    return returns
In [24]:
def get_market_pricing_data(closing_prices_file):
    
    pricing_filepath = closefolder_market + closing_prices_file    

    with open(pricing_filepath, 'rb') as fd:
        pricing = pkl.load(fd)  

    print("pricing.shape=%s" % str(pricing.shape))    
    
    return pricing
In [25]:
def get_sector_returns_data(returns_file):
    
    returns_filepath = returnsfolder_sectors + returns_file

    with open(returns_filepath, 'rb') as fd:
        returns = pkl.load(fd)  

    print("returns.shape=%s" % str(returns.shape))    
    
    return returns
In [26]:
def get_sector_pricing_data(closing_prices_file):
    
    pricing_filepath = closefolder_sectors + closing_prices_file    

    with open(pricing_filepath, 'rb') as fd:
        pricing = pkl.load(fd)  

    print("pricing.shape=%s" % str(pricing.shape))    
    
    return pricing

Entire Market

In [95]:
universe = "EntireMarket"

returns_file = "returns_2000-2008_len_2485.pkl"
closing_prices_file = "pricing_2000-2008_len_2485.pkl"

returns = get_market_returns_data(returns_file)    
pricing = get_market_pricing_data(closing_prices_file)
returns.shape=(2009, 1797)
pricing.shape=(2010, 1797)

PCA

  • Reduce dimensions of returns using PCA
  • Aggregate fundamental data to principal components
  • Standardize vectors

For PCA, prune returns and pricing so that they hold only those stocks for which we have fundamentals data.

In [55]:
returnsUnpruned = returns.copy()
pricingUnpruned = pricing.copy()

retT = returns.T  
pricingT = pricing.T    
In [56]:
marketcap_filepath = fundamentalsfolder + "marketcap.xlsx"
pricetobook_filepath = fundamentalsfolder + "price_to_book.xlsx"
pricetoearnings_filepath = fundamentalsfolder + "price_to_earnings.xlsx"
enterprisevalue_filepath = fundamentalsfolder + "enterprise_value.xlsx"
In [57]:
marketcap = pd.read_excel(marketcap_filepath)
marketcap = marketcap.set_index('Ticker') 
pricetobook = pd.read_excel(pricetobook_filepath)
pricetobook = pricetobook.set_index('Ticker') 
pricetoearnings = pd.read_excel(pricetoearnings_filepath)
pricetoearnings = pricetoearnings.set_index('Ticker') 
enterprisevalue = pd.read_excel(enterprisevalue_filepath)
enterprisevalue = enterprisevalue.set_index('Ticker') 
In [58]:
retTIndex = retT.index
pricingTIndex = pricingT.index
marketcapIndex = marketcap.index
pricetobookIndex = pricetobook.index
pricetoearningsIndex = pricetoearnings.index
enterprisevalueIndex = enterprisevalue.index
In [59]:
idx = retTIndex.copy()
idx = idx.intersection(pricingTIndex)
idx = idx.intersection(marketcapIndex)
idx = idx.intersection(pricetobookIndex)
idx = idx.intersection(pricetoearningsIndex)
idx = idx.intersection(enterprisevalueIndex) 
print("idx.shape=%s" % str(idx.shape))
idx.shape=(1101,)
In [60]:
marketcap = marketcap.loc[
    marketcap.index.intersection(idx)]
pricetobook = pricetobook.loc[
    pricetobook.index.intersection(idx)]
pricetoearnings = pricetoearnings.loc[
    pricetoearnings.index.intersection(idx)]
enterprisevalue = enterprisevalue.loc[
    enterprisevalue.index.intersection(idx)]
retT = retT.loc[
    retT.index.intersection(idx)]
pricingT = pricingT.loc[
    pricingT.index.intersection(idx)]
In [61]:
returns = retT.T
pricing = pricingT.T

print("returns.shape=%s" % str(returns.shape))
print("pricing.shape=%s" % str(pricing.shape))
returns.shape=(2009, 1101)
pricing.shape=(2010, 1101)
In [62]:
N_PRIN_COMPONENTS = 50
pca = PCA(n_components=N_PRIN_COMPONENTS)
pca.fit(returns) 
Out[62]:
PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
In [63]:
print ("marketcap:\t\t%s" % str(marketcap.shape))
print ("pricetobook:\t\t%s" % str(pricetobook.shape))
print ("pricetoearnings:\t%s" % str(pricetoearnings.shape))
print ("enterprisevalue:\t%s" % str(enterprisevalue.shape))
marketcap:		(1101, 1)
pricetobook:		(1101, 1)
pricetoearnings:	(1101, 1)
enterprisevalue:	(1101, 1)
In [64]:
X = np.hstack(
    (pca.components_.T,
     marketcap['Marketcap'].values[:, np.newaxis],
     pricetobook['Pricetobook'].values[:, np.newaxis], 
     pricetoearnings['Price to Earnings'].values[:, np.newaxis],
     enterprisevalue['Enterprisevalue'].values[:, np.newaxis]     
    )
)
print("X.shape=%s" % str(X.shape))
X.shape=(1101, 54)
In [65]:
X = preprocessing.StandardScaler().fit_transform(X) 
print("X.shape=%s" % str(X.shape))
X.shape=(1101, 54)

DBSCAN using principal components

In [68]:
clustering_results = build_dbscan_clusters(X, returns.columns, pricing)

# dbscan_cluster_map_all includes datapoints 
# belonging to cluster -1 (i.e. datapoints
# belonging to no cluster)
dbscan_cluster_map_all = clustering_results[0] 
dbscan_cluster_map = clustering_results[1]
dbscan_counts = clustering_results[2]
dbscan_labels = clustering_results[3]
DBSCAN(algorithm='auto', eps=1.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=3, n_jobs=1, p=None)
Number of DBSCAN clusters detected: 5
Number of pairs we can create from DBSCAN clusters: 52482
Maximum cluster size (in cluster 0): 228

3 cluster labels with the highest counts=[0, 3]
3 highest counts=[228, 4]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[3, 0]
3 smallest counts=[4, 228]
Tickers in cluster 3: ['CAH', 'HRC', 'MRK', 'PFE']
Tickers in cluster 0: ['ABCB', 'ACGL', 'ADC', 'AFG', 'AFL', 'AIV', 'AKR', 'ALB', 'ALG', 'ALL', 'ALX', 'AME', 'ANAT', 'APD', 'ARE', 'AROW', 'ARTNA', 'AVB', 'AVY', 'AWR', 'BBT', 'BC', 'BCPC', 'BDGE', 'BDN', 'BEN', 'BFS', 'BGG', 'BHB', 'BMRC', 'BMS', 'BMTC', 'BOH', 'BOKF', 'BUSE', 'BXP', 'CAC', 'CAG', 'CASH', 'CASS', 'CB', 'CBL', 'CBSH', 'CBU', 'CCBG', 'CCF', 'CFFN', 'CFR', 'CHD', 'CHMG', 'CINF', 'CLI', 'CMA', 'CNA', 'CNBKA', 'COKE', 'COLB', 'CPB', 'CPK', 'CPT', 'CSL', 'CTO', 'CUZ', 'CW', 'CWT', 'CZNC', 'DCI', 'DCOM', 'DEL', 'DJCO', 'DRE', 'EFX', 'EGP', 'ELS', 'ELY', 'EMN', 'EPR', 'EQR', 'ERIE', 'ESGR', 'ESS', 'FCBC', 'FDEF', 'FFIC', 'FHN', 'FITB', 'FLIC', 'FMBI', 'FNB', 'FNLC', 'FR', 'FRT', 'FULT', 'GGP', 'GHC', 'GIS', 'GPC', 'GRC', 'GSBC', 'GTY', 'GWW', 'HBAN', 'HFWA', 'HIW', 'HNI', 'HPT', 'HRL', 'HWKN', 'IBKC', 'IFF', 'IOSP', 'IP', 'IRET', 'JJSF', 'JLL', 'K', 'KEY', 'KIM', 'KMPR', 'KMT', 'KRC', 'KWR', 'LEG', 'LION', 'LNC', 'LNCE', 'SBCF', 'SCL', 'LXP', 'MAC', 'MBFI', 'MBWM', 'MCO', 'MCY', 'MDP', 'MFA', 'MKL', 'MLHR', 'MLM', 'MMM', 'MO', 'MTB', 'MTX', 'NC', 'NLY', 'NPK', 'NRIM', 'NTRS', 'NWL', 'NWLI', 'NYT', 'O', 'OFC', 'OLP', 'ONB', 'OPY', 'ORI', 'PBCT', 'PCH', 'PEP', 'PG', 'PII', 'PLD', 'PNC', 'PRK', 'PSA', 'PSB', 'PWOD', 'PX', 'R', 'RE', 'RF', 'RLI', 'RNST', 'RPT', 'RWT', 'RYN', 'SAM', 'SJI', 'SJM', 'SJW', 'SLG', 'SLM', 'SNV', 'SON', 'SPG', 'STAR', 'STI', 'STL', 'STRT', 'SUI', 'SWK', 'SWM', 'SXI', 'SYBT', 'TCBK', 'TFX', 'TGNA', 'TMK', 'TMP', 'TR', 'TRMK', 'TROW', 'UBA', 'UDR', 'UHT', 'UMBF', 'USB', 'UTL', 'VLY', 'VMC', 'VNO', 'VTR', 'WABC', 'WAFD', 'WBS', 'WEYS', 'WMK', 'WRE', 'WRI', 'WSFS', 'WST', 'WTBA', 'WTFC', 'WY', 'Y', 'YORW', 'ZION']
In [69]:
embed0, embed1, embedding = get_TSNE_embeddings(X)
embedding.shape=(1101, 2)
embed0.shape=(1101,)
embed1.shape=(1101,)

Points in clusters are colored and opaque. Points not belonging to a cluster (i.e. noise) are gray and translucent.

In [58]:
X_tsne = embedding

# The logical expression returns a  
# a list of boolean values
x = X_tsne[(dbscan_labels!=-1), 0]
y = X_tsne[(dbscan_labels!=-1), 1]

# The logical expression returns a  
# **series** whose values are booleans
x_noise = X_tsne[(dbscan_cluster_map_all==-1).values, 0]
y_noise = X_tsne[(dbscan_cluster_map_all==-1).values, 1]
In [59]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_noisy_scatter_plot(x, y, x_noise, y_noise, dbscan_labels, fig)

Cointegration - Test for Cointegrated Pairs

Apply formula $\frac{N(N - 1)}{2}$ elementwise to each cluster, then sum the results to get the combinatorial sum of possible pairs.

In [83]:
print ("Total pairs possible in universe: %d " % 
       (counts*(counts-1)/2).sum()) 
Total pairs possible in universe: 26241 

Detect cointegrated pairs in all clusters, then aggregate the results into a single list of pairs.

In [84]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            dbscan_cluster_map, 
            dbscan_counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 1906 pairs.
Containing 244 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.059298

Cointegrated Pair Visualization

Lastly, for the pairs we found and validated, let's visualize them in 2d space with T-SNE again.

In [66]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))  

visualize_cointegrated_pairs(
    X,
    returns, 
    X_tsne,
    pairs_allclusters, 
    dbscan_cluster_map, 
    fig)
Size of map: (232,)
X_pairs.shape=(232, 54)

Covariance Shrinkage, Graph Lasso, LLE

In [554]:
universe = "EntireMarket"

returns_file = "returns_2000-2008_len_2485.pkl"
pricing_file = "pricing_2000-2008_len_2485.pkl"
returns = get_market_returns_data(returns_file)
pricing = get_market_pricing_data(pricing_file)
returns.shape=(2009, 1797)
pricing.shape=(2010, 1797)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [28]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 1613706 
In [555]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 267
Cluster max size: 27

3 cluster labels with the highest counts=[264, 27, 5]
3 highest counts=[27, 26, 24]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[47, 19, 74]
3 smallest counts=[2, 2, 2]
Tickers in cluster 47: ['CBL', 'RTI']
Tickers in cluster 19: ['APOL', 'PVA']
Tickers in cluster 74: ['DWSN', 'TGE']
In [556]:
embed0, embed1, embedding = get_LLE_embeddings(returns)  
embedding.shape=(1797, 2)
embed0.shape=(1797,)
embed1.shape=(1797,)
In [557]:
fig = plt.figure(1, facecolor='w', figsize=(7, 5))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=10)
In [558]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 

Detect cointegrated pairs in all clusters, then aggregate the results into a single list of pairs.

In [788]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 1077 pairs.
Containing 913 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.151424
In [560]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (913,)
X_pairs.shape=(913, 2009)
In [795]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [796]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 13
Number of conintegrated pairs = 5
Fraction of partially corr pairs that cointegrate = 0.385
In [789]:
df = partially_correlated_pairs(universe, returns, icov, 0.1)
In [790]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 49
Number of conintegrated pairs = 15
Fraction of partially corr pairs that cointegrate = 0.306
In [793]:
df = partially_correlated_pairs(universe, returns, icov, 0.01)
In [794]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 825
Number of conintegrated pairs = 209
Fraction of partially corr pairs that cointegrate = 0.253

Market Sectors

Basic Materials

In [29]:
universe = "Basic Materials"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 21)
pricing.shape=(2010, 21)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [35]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 210 
In [30]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 5
Cluster max size: 11

3 cluster labels with the highest counts=[0, 3]
3 highest counts=[11, 7]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[3, 0]
3 smallest counts=[7, 11]
Tickers in cluster 3: ['AKS', 'CENX', 'CLF', 'FCX', 'NUE', 'SCCO', 'X']
Tickers in cluster 0: ['APD', 'ALB', 'ATR', 'CBT', 'FMC', 'IP', 'OLN', 'PX', 'SWM', 'SMG', 'SON']
In [31]:
embed0, embed1, embedding = get_LLE_embeddings(returns)     
embedding.shape=(21, 2)
embed0.shape=(21,)
embed1.shape=(21,)
In [32]:
fig = plt.figure(1, facecolor='w', figsize=(7, 5))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [33]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [34]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 17 pairs.
Containing 15 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.213420
In [808]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (15,)
X_pairs.shape=(15, 2009)
In [812]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [813]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0

Cyclical Consumer

In [36]:
universe = "Cyclical Consumer"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 111)
pricing.shape=(2010, 111)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [37]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 6105 
In [825]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 31
Cluster max size: 28

3 cluster labels with the highest counts=[11, 29, 14]
3 highest counts=[28, 21, 11]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[13, 6, 27]
3 smallest counts=[2, 2, 3]
Tickers in cluster 13: ['IPG', 'OMC']
Tickers in cluster 6: ['CHH', 'SBUX']
Tickers in cluster 27: ['FOXA', 'FOX', 'DIS']
In [826]:
embed0, embed1, embedding = get_LLE_embeddings(returns)   
embedding.shape=(111, 2)
embed0.shape=(111,)
embed1.shape=(111,)
In [827]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [828]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [829]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 107 pairs.
Containing 58 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.122808
In [830]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (58,)
X_pairs.shape=(58, 2009)
In [831]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [832]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 2
Number of conintegrated pairs = 1
Fraction of partially corr pairs that cointegrate = 0.500

Energy

In [38]:
universe = "Energy"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 78)
pricing.shape=(2010, 78)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [39]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 3003 
In [862]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 19
Cluster max size: 54

3 cluster labels with the highest counts=[15, 3, 13]
3 highest counts=[54, 6, 2]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[13, 3, 15]
3 smallest counts=[2, 6, 54]
Tickers in cluster 13: ['FCEL', 'PLUG']
Tickers in cluster 3: ['BP', 'CVX', 'COP', 'EGN', 'XOM', 'OXY']
Tickers in cluster 15: ['APC', 'APA', 'BRS', 'COG', 'CPE', 'CRR', 'CRZO', 'CHK', 'CRK', 'CNX', 'DNR', 'DVN', 'DO', 'DRQ', 'ESV', 'EOG', 'EQT', 'GIFI', 'HAL', 'HLX', 'HP', 'HES', 'IO', 'KEG', 'MRO', 'MDR', 'MUR', 'NBR', 'NOV', 'NFX', 'NR', 'NBL', 'OII', 'PKD', 'PTEN', 'PDCE', 'PQ', 'PXD', 'RRC', 'RDC', 'RES', 'SLB', 'CKH', 'SM', 'SWN', 'SGY', 'SPN', 'TK', 'TTI', 'TDW', 'RIG', 'UNT', 'VLO', 'WMB']
In [863]:
embed0, embed1, embedding = get_LLE_embeddings(returns)  
embedding.shape=(78, 2)
embed0.shape=(78,)
embed1.shape=(78,)
In [864]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [865]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [866]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 124 pairs.
Containing 54 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.383974
In [867]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (54,)
X_pairs.shape=(54, 2009)
In [868]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [869]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0
In [873]:
df = partially_correlated_pairs(universe, returns, icov, 0.2)
In [874]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 5
Number of conintegrated pairs = 2
Fraction of partially corr pairs that cointegrate = 0.400

Financials

In [40]:
universe = "Financials"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 315)
pricing.shape=(2010, 315)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [41]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 49455 
In [876]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 49
Cluster max size: 30

3 cluster labels with the highest counts=[7, 39, 20]
3 highest counts=[30, 25, 20]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[8, 5, 26]
3 smallest counts=[2, 2, 2]
Tickers in cluster 8: ['BRO', 'NKSH']
Tickers in cluster 5: ['BANF', 'PPBI']
Tickers in cluster 26: ['LKFN', 'SF']
In [877]:
embed0, embed1, embedding = get_LLE_embeddings(returns)  
embedding.shape=(315, 2)
embed0.shape=(315,)
embed1.shape=(315,)
In [878]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=30)
In [879]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [880]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 88 pairs.
Containing 115 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.100680
In [881]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (115,)
X_pairs.shape=(115, 2009)
In [884]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [885]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0
In [894]:
df = partially_correlated_pairs(universe, returns, icov, 0.2)
In [891]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 2
Number of conintegrated pairs = 0
Fraction of partially corr pairs that cointegrate = 0.000
In [892]:
df = partially_correlated_pairs(universe, returns, icov, 0.1)
In [893]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 22
Number of conintegrated pairs = 1
Fraction of partially corr pairs that cointegrate = 0.045

Healthcare

In [42]:
universe = "Healthcare"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 82)
pricing.shape=(2010, 82)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [43]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 3321 
In [897]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 19
Cluster max size: 34

3 cluster labels with the highest counts=[18, 15, 12]
3 highest counts=[34, 13, 12]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[17, 9, 12]
3 smallest counts=[4, 5, 12]
Tickers in cluster 17: ['LPNT', 'DGX', 'THC', 'UHS']
Tickers in cluster 9: ['ABAX', 'BCRX', 'HUM', 'MD', 'UNH']
Tickers in cluster 12: ['ABT', 'BAX', 'BDX', 'BSX', 'BMY', 'LLY', 'JNJ', 'MDT', 'MRK', 'MYL', 'PFE', 'SYK']
In [898]:
embed0, embed1, embedding = get_LLE_embeddings(returns)   
embedding.shape=(82, 2)
embed0.shape=(82,)
embed1.shape=(82,)
In [899]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [900]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [901]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 196 pairs.
Containing 53 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.106253
In [902]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (53,)
X_pairs.shape=(53, 2009)
In [903]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [904]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0
In [906]:
df = partially_correlated_pairs(universe, returns, icov, 0.2)
In [907]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 2
Number of conintegrated pairs = 0
Fraction of partially corr pairs that cointegrate = 0.000
In [908]:
df = partially_correlated_pairs(universe, returns, icov, 0.1)
In [909]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 26
Number of conintegrated pairs = 9
Fraction of partially corr pairs that cointegrate = 0.346
In [910]:
pairs
Out[910]:
[('LLY', 'MRK'),
 ('BMY', 'LLY'),
 ('TECH', 'UTHR'),
 ('LLY', 'PFE'),
 ('ALKS', 'SYK'),
 ('PKI', 'UHS'),
 ('ALKS', 'NKTR'),
 ('MDT', 'PFE'),
 ('ALKS', 'INCY')]

Industrials

In [44]:
universe = "Industrials"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 157)
pricing.shape=(2010, 157)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [45]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 12246 
In [912]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 36
Cluster max size: 83

3 cluster labels with the highest counts=[8, 0, 1]
3 highest counts=[83, 10, 8]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[27, 11, 34]
3 smallest counts=[2, 3, 3]
Tickers in cluster 27: ['ABM', 'MINI']
Tickers in cluster 11: ['DY', 'MTZ', 'PWR']
Tickers in cluster 34: ['RSG', 'WCN', 'WM']
In [913]:
embed0, embed1, embedding = get_LLE_embeddings(returns)   
embedding.shape=(157, 2)
embed0.shape=(157,)
embed1.shape=(157,)
In [914]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [915]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [916]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 589 pairs.
Containing 104 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.112479
In [917]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (104,)
X_pairs.shape=(104, 2009)
In [918]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [919]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0
In [920]:
df = partially_correlated_pairs(universe, returns, icov, 0.2)
In [921]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 9
Number of conintegrated pairs = 1
Fraction of partially corr pairs that cointegrate = 0.111

Non-cyclicals

In [46]:
universe = "Non-cyclicals"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 62)
pricing.shape=(2010, 62)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [47]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 1891 
In [934]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 15
Cluster max size: 15

3 cluster labels with the highest counts=[1, 13, 5]
3 highest counts=[15, 9, 9]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[10, 14, 4]
3 smallest counts=[2, 3, 4]
Tickers in cluster 10: ['PPC', 'SAFM']
Tickers in cluster 14: ['CVS', 'KR', 'WBA']
Tickers in cluster 4: ['KO', 'CCE', 'STZ', 'PEP']
In [935]:
embed0, embed1, embedding = get_LLE_embeddings(returns)   
embedding.shape=(62, 2)
embed0.shape=(62,)
embed1.shape=(62,)
In [936]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [937]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [938]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 26 pairs.
Containing 27 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.088492
In [939]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (27,)
X_pairs.shape=(27, 2009)
In [940]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [941]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0
In [942]:
df = partially_correlated_pairs(universe, returns, icov, 0.2)
In [943]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 6
Number of conintegrated pairs = 0
Fraction of partially corr pairs that cointegrate = 0.000
In [944]:
df = partially_correlated_pairs(universe, returns, icov, 0.05)
In [945]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 99
Number of conintegrated pairs = 8
Fraction of partially corr pairs that cointegrate = 0.081

Technology

In [48]:
universe = "Technology"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 152)
pricing.shape=(2010, 152)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [49]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 11476 
In [947]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 24
Cluster max size: 58

3 cluster labels with the highest counts=[23, 17, 1]
3 highest counts=[58, 9, 7]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[15, 16, 3]
3 smallest counts=[2, 2, 2]
Tickers in cluster 15: ['PRFT', 'TIVO']
Tickers in cluster 16: ['NSIT', 'PRGS']
Tickers in cluster 3: ['APH', 'ZIXI']
In [948]:
embed0, embed1, embedding = get_LLE_embeddings(returns)   
embedding.shape=(152, 2)
embed0.shape=(152,)
embed1.shape=(152,)
In [949]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [950]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [951]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 1070 pairs.
Containing 105 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.263577
In [952]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (105,)
X_pairs.shape=(105, 2009)
In [953]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [954]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 0
In [955]:
df = partially_correlated_pairs(universe, returns, icov, 0.2)
In [956]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 1
Number of conintegrated pairs = 0
Fraction of partially corr pairs that cointegrate = 0.000
In [957]:
df = partially_correlated_pairs(universe, returns, icov, 0.1)
In [958]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 7
Number of conintegrated pairs = 5
Fraction of partially corr pairs that cointegrate = 0.714

Telecom

In [50]:
universe = "Telecom"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 14)
pricing.shape=(2010, 14)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [51]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 91 
In [969]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 6
Cluster max size: 5

3 cluster labels with the highest counts=[5, 4]
3 highest counts=[5, 5]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[5, 4]
3 smallest counts=[5, 5]
Tickers in cluster 5: ['T', 'CTL', 'FTR', 'S', 'VZ']
Tickers in cluster 4: ['CBB', 'SBAC', 'TDS', 'USM', 'VOD']
In [970]:
embed0, embed1, embedding = get_LLE_embeddings(returns)   
embedding.shape=(14, 2)
embed0.shape=(14,)
embed1.shape=(14,)
In [971]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [972]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [973]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 0 pairs.
Containing 0 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.000000
In [974]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
No pairs to visualize.
<Figure size 576x432 with 0 Axes>
In [975]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [976]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 2
Number of conintegrated pairs = 0
Fraction of partially corr pairs that cointegrate = 0.000

Utilities

In [52]:
universe = "Utilities"
 
returns = get_sector_returns_data(universe + ".pkl")
pricing = get_sector_pricing_data(universe + ".pkl")
returns.shape=(2009, 57)
pricing.shape=(2010, 57)

Formula $\frac{N(N - 1)}{2}$ yields the number of possible pair combinations in this stock universe.

In [53]:
# the initial dimensionality of the search was
ticker_count = len(returns.columns)
print ("Total pairs possible in universe: %d " % 
       (ticker_count*(ticker_count-1)/2)) 
Total pairs possible in universe: 1596 
In [978]:
cluster_info = build_affinity_clusters(universe, returns, pricing)  

cov = cluster_info[0]
icov = cluster_info[1] 
labels = cluster_info[2] 
counts = cluster_info[3] 
cluster_map = cluster_info[4]

Number of clusters detected: 11
Cluster max size: 31

3 cluster labels with the highest counts=[0, 10, 1]
3 highest counts=[31, 14, 3]

Plot the price series of stocks in smallest clusters.

3 cluster labels with the smallest counts=[5, 1, 10]
3 smallest counts=[2, 3, 14]
Tickers in cluster 5: ['EIX', 'PCG']
Tickers in cluster 1: ['AWR', 'WTR', 'CWT']
Tickers in cluster 10: ['ATO', 'BKH', 'EE', 'MDU', 'MGEE', 'NFG', 'NJR', 'NWN', 'OKE', 'SJI', 'SWX', 'UGI', 'VVC', 'WGL']
In [979]:
embed0, embed1, embedding = get_LLE_embeddings(returns)  
embedding.shape=(57, 2)
embed0.shape=(57,)
embed1.shape=(57,)
In [980]:
fig = plt.figure(1, facecolor='w', figsize=(10, 8))

do_scatter_plot(embed0, embed1, labels, counts, fig, dotsize=100)
In [981]:
fig = plt.figure(facecolor='w', figsize=(9, 7))
do_LLE_graph(returns, icov, labels, embedding, fig) 
In [982]:
pair_data = aggregate_pairs_allclusters (
            universe,
            pricing, 
            cluster_map, 
            counts)

pairs_allclusters = pair_data[0]
clust_dict = pair_data[1]
We found 26 pairs.
Containing 25 unique tickers.
Fraction (on the average) of pairs that cointegrate = 0.022817
In [983]:
fig = plt.figure(1, facecolor='w', figsize=(8, 6))  

visualize_cointegrated_pairs(
    returns.T,
    returns, 
    embedding,
    pairs_allclusters, 
    cluster_map, 
    fig)
Size of map: (25,)
X_pairs.shape=(25, 2009)
In [984]:
df = partially_correlated_pairs(universe, returns, icov, 0.4)
In [985]:
pairs = find_coint_partially_corr_pairs(df)
Finding cointegrated pairs of partially correlated stocks:
Number of potentially cointegrated pairs = 1
Number of conintegrated pairs = 0
Fraction of partially corr pairs that cointegrate = 0.000